Library
library(tidyverse)
library(here)
library(readxl)
library(janitor)
library(stringr)
library(tidyr)
library(dplyr)
library(knitr)
library(ggplot2)
library(factoextra)
library(gt)
library(naniar)
library(ggcorrplot)
library(corrplot)
library(GGally)
library(randomForest)
library(pdp)
library(e1071)
library(dplyr)
library(caret)
library(plotly)
library(stargazer)
library(broom)
library(knitr)
library(kableExtra)
library(grid)
library(gridExtra)
library(patchwork)
library(glmnet)
Data Import
raw_bp <- read.csv(here("ZIPallFiles","AHSnpa11bp.csv"), header=TRUE)
raw_bb <- read.csv(here("ZIPallFiles","AHSnpa11bb.csv"), header=TRUE)
raw_bsp <- read.csv(here("ZIPallFiles","inp13bsp.csv"), header=TRUE)
raw_bcn <- read.csv(here("ZIPallFiles","inp13bcn.csv"), header=TRUE)
raw_bhh <- read.csv(here("ZIPallFiles","inp13bhh.csv"), header=TRUE)
Rename and Created
bp_bb <- inner_join(raw_bp, raw_bb, by = "ABSPID") %>%
rename(HypertensionStatus = HYPBC,
BodyMass = SABDYMS,
SocialStatus = SF2SA1QN,
PersonID = ABSPID,
Age = AGEC,
Gender = SEX,
BMI = BMISC,
Systolic = SYSTOL,
Diastolic = DIASTOL,
SmokeStatus = SMKSTAT,
AlcoholPercentage_Day1 = ALCPER1,
AlcoholPercentage_Day2 = ALCPER2,
Potassium_Day1 = POTAST1,
Potassium_Day2 = POTAST2,
Sodium_Day1 = SODIUMT1,
Sodium_Day2 = SODIUMT2,
FiberEnergy_Day1 = FIBRPER1,
FiberEnergy_Day2 = FIBRPER2,
CarbonhydrateEnergy_Day1 = CHOPER1,
CarbonhydrateEnergy_Day2 = CHOPER2,
EnergyBMR_Day1 = EIBMR1,
EnergyBMR_Day2 = EIBMR2) %>%
mutate(Identity = factor(0))
bsp_bhh<- left_join(raw_bsp, raw_bhh,by = c("ABSHID")) %>%
rename(BodyMass = SABDYMS,
SocialStatus = SF2SA1DB,
HouseID = ABSHID,
Age = AGEEC,
Gender = SEX,
BMI = BMISC,
Systolic = SYSTOL,
Diastolic = DIASTOL,
SmokeStatus = SMKSTAT,
AlcoholPercentage_Day1 = ALCPER1,
AlcoholPercentage_Day2 = ALCPER2,
Potassium_Day1 = POTAST1,
Potassium_Day2 = POTAST2,
Sodium_Day1 = SODIUMT1,
Sodium_Day2 = SODIUMT2,
FiberEnergy_Day1 = FIBRPER1,
FiberEnergy_Day2 = FIBRPER2,
CarbonhydrateEnergy_Day1 = CHOPER1,
CarbonhydrateEnergy_Day2 = CHOPER2,
EnergyBMR_Day1 = EIBMR1,
EnergyBMR_Day2 = EIBMR2)%>%
mutate(Identity = factor(1))
bcn <- raw_bcn %>%
rename(HouseID = ABSHID,
MedicalCondition = ICD10ME)
First Filter
bp_bb <- bp_bb %>%
mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
filter(
Age >= 18,
HypertensionStatus == 5,
BodyMass != 4,
(is.na(EnergyBMR) | EnergyBMR >= 0.9))
bsp_bhh <- bsp_bhh %>%
mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
filter(
Age >= 18,
BodyMass != 4,
!HouseID %in% bcn$HouseID[bcn$MedicalCondition %in% c(7, 20)],
(is.na(EnergyBMR) | EnergyBMR >= 0.9))
Merge
raw_data <- bind_rows(bp_bb,bsp_bhh)
Variable Selection
selected_data <- raw_data %>%
select(
PersonID, HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, Identity,
AlcoholPercentage_Day1, AlcoholPercentage_Day2,
Potassium_Day1, Potassium_Day2,
Sodium_Day1, Sodium_Day2,
FiberEnergy_Day1, FiberEnergy_Day2,
CarbonhydrateEnergy_Day1, CarbonhydrateEnergy_Day2
)
Type Check 1
data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
gt() %>%
tab_header(
title = "Variable Names and Their Types"
) %>%
tab_caption(caption = md("Table 1: Variable types table"))
Table 1: Variable types table
| Variable Names and Their Types |
| Variable |
Type |
| PersonID |
character |
| HouseID |
character |
| SocialStatus |
integer |
| Age |
integer |
| Gender |
integer |
| BMI |
numeric |
| Systolic |
integer |
| Diastolic |
integer |
| SmokeStatus |
integer |
| Identity |
factor |
| AlcoholPercentage_Day1 |
numeric |
| AlcoholPercentage_Day2 |
numeric |
| Potassium_Day1 |
numeric |
| Potassium_Day2 |
numeric |
| Sodium_Day1 |
numeric |
| Sodium_Day2 |
numeric |
| FiberEnergy_Day1 |
numeric |
| FiberEnergy_Day2 |
numeric |
| CarbonhydrateEnergy_Day1 |
numeric |
| CarbonhydrateEnergy_Day2 |
numeric |
Na values + New Variables + Type Conversion
selected_data <- selected_data %>%
mutate(
across(c(Gender, SocialStatus, SmokeStatus), as.factor),
BMI = na_if(BMI, 98) %>% na_if(99),
Systolic = na_if(Systolic, 0) %>% na_if(998) %>% na_if(999),
Diastolic = na_if(Diastolic, 0) %>% na_if(998) %>% na_if(999),
Potassium = (Potassium_Day1 + Potassium_Day2) / 2,
Sodium = (Sodium_Day1 + Sodium_Day2) / 2,
FiberEnergy = (FiberEnergy_Day1 + FiberEnergy_Day2)/2,
CarbonhydrateEnergy = (CarbonhydrateEnergy_Day1 + CarbonhydrateEnergy_Day2) / 2,
AlcoholPercentage = (AlcoholPercentage_Day1 + AlcoholPercentage_Day2) /2,
PotassiumSodiumRatio = Sodium / Potassium
) %>%
select(PersonID,HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, AlcoholPercentage, PotassiumSodiumRatio, FiberEnergy, CarbonhydrateEnergy, Identity)
Type Check 2
data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
gt() %>%
tab_header(
title = "Variable Names and Their Types"
) %>%
tab_caption(caption = md("Table 2: Variable types table after correction"))
Table 2: Variable types table after correction
| Variable Names and Their Types |
| Variable |
Type |
| PersonID |
character |
| HouseID |
character |
| SocialStatus |
factor |
| Age |
integer |
| Gender |
factor |
| BMI |
numeric |
| Systolic |
integer |
| Diastolic |
integer |
| SmokeStatus |
factor |
| AlcoholPercentage |
numeric |
| PotassiumSodiumRatio |
numeric |
| FiberEnergy |
numeric |
| CarbonhydrateEnergy |
numeric |
| Identity |
factor |
Duplicate Values
selected_data %>%
select(-PersonID) %>%
distinct() %>%
dim() %>%
tibble::tibble(Dimension = c("Rows", "Columns"), Count = .) %>%
gt() %>%
tab_header(title = "Dimensions of Unique Tibble") %>%
tab_caption(caption = md("Table 3: Dimensions of Unique Data Table"))
Table 3: Dimensions of Unique Data Table
| Dimensions of Unique Tibble |
| Dimension |
Count |
| Rows |
8449 |
| Columns |
13 |
Outliers and New Variables
normal_ranges <- list(Systolic = c(90, 180),
Diastolic = c(60, 120),
BMI = c(16, 47.5),
PotassiumSodiumRatio = c(0, 4),
AlcoholPercentage = c(0, 85),
CarbonhydrateEnergy = c(0, 100),
FiberEnergy = c(0, 100))
selected_data <- selected_data %>%
filter(
(is.na(Systolic) | (Systolic >= normal_ranges$Systolic[1] & Systolic <= normal_ranges$Systolic[2])),
(is.na(Diastolic) | (Diastolic >= normal_ranges$Diastolic[1] & Diastolic <= normal_ranges$Diastolic[2])),
(is.na(BMI) | (BMI >= normal_ranges$BMI[1] & BMI <= normal_ranges$BMI[2])),
(is.na(AlcoholPercentage) | (AlcoholPercentage >= normal_ranges$AlcoholPercentage[1] & AlcoholPercentage <= normal_ranges$AlcoholPercentage[2])),
(is.na(CarbonhydrateEnergy) | (CarbonhydrateEnergy >= normal_ranges$CarbonhydrateEnergy[1] & CarbonhydrateEnergy <= normal_ranges$CarbonhydrateEnergy[2])),
(is.na(FiberEnergy) | (FiberEnergy >= normal_ranges$FiberEnergy[1] & FiberEnergy <= normal_ranges$FiberEnergy[2])),
(is.na(PotassiumSodiumRatio) | (PotassiumSodiumRatio >= normal_ranges$PotassiumSodiumRatio[1] & PotassiumSodiumRatio <= normal_ranges$PotassiumSodiumRatio[2])))
Categrocial Distribution
custom_colors <- c("#540d6e", "#ee4266","#ffd23f","#3bceac","#0ead69")
categorical_vars <- selected_data %>%
select(where(is.factor)) %>%
names()
long_data <- selected_data %>%
pivot_longer(cols = all_of(categorical_vars), names_to = "Variable", values_to = "Value")
plot <- ggplot(long_data, aes(x = Value, fill = Variable)) +
geom_bar(aes(y = after_stat(prop), group = Variable), position = "dodge", show.legend = FALSE) +
facet_wrap(~ Variable, scales = "free", nrow = 2) +
scale_fill_manual(values = custom_colors) +
labs(title = "Overall Distribution of Categorical Variables",
y = "Proportion",
caption = "Figure 1: Proportions of Categorical Variables") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)

Merge Variable
selected_data <- selected_data %>%
mutate(SmokeStatus = recode_factor(SmokeStatus, "2" = "1", "3" = "1","4" = "2","5" = "3"))
Missing Values
selected_data %>%
select(-PersonID, -HouseID) %>%
miss_var_summary() %>%
gt() %>%
tab_header(title = "Missing Value") %>%
tab_caption(caption = md("Table 4: Missing Values in Selected Data")) %>%
cols_label(variable = "Variable Name", n_miss = "Missing Count", pct_miss = "Percentage Missing") %>%
fmt_number(columns = everything(), decimals = 2)
Table 4: Missing Values in Selected Data
| Missing Value |
| Variable Name |
Missing Count |
Percentage Missing |
| BMI |
1,242.00 |
15.7 |
| Systolic |
1,222.00 |
15.4 |
| Diastolic |
1,222.00 |
15.4 |
| SocialStatus |
0.00 |
0 |
| Age |
0.00 |
0 |
| Gender |
0.00 |
0 |
| SmokeStatus |
0.00 |
0 |
| AlcoholPercentage |
0.00 |
0 |
| PotassiumSodiumRatio |
0.00 |
0 |
| FiberEnergy |
0.00 |
0 |
| CarbonhydrateEnergy |
0.00 |
0 |
| Identity |
0.00 |
0 |
corr_matrix <- selected_data%>%
select(where(is.numeric)) %>%
mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))) %>%
cor(use = "complete.obs")
ggcorrplot(corr_matrix, lab = TRUE,
title = "Correlation Heatmap for ALL",
colors = c("blue", "white", "red"),
tl.cex = 9, lab_size = 4) +
theme(legend.position = "none") +
labs(caption = "Figure 2: Correlation Heatmap for ALL Variables")

Regression Imputation
selected_data$BMI[is.na(selected_data$BMI)] <- lm(BMI ~ Gender + SocialStatus + Age + PotassiumSodiumRatio + FiberEnergy + CarbonhydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>%
predict(newdata = selected_data[is.na(selected_data$BMI), ])
selected_data$Systolic[is.na(selected_data$Systolic)] <- lm(Systolic ~ Gender + SocialStatus + Age + BMI + PotassiumSodiumRatio + FiberEnergy + CarbonhydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>%
predict(newdata = selected_data[is.na(selected_data$Systolic), ])
selected_data$Diastolic[is.na(selected_data$Diastolic)] <- lm(Diastolic ~ Gender + SocialStatus + Age + BMI + Systolic + PotassiumSodiumRatio + FiberEnergy + CarbonhydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>%
predict(newdata = selected_data[is.na(selected_data$Diastolic), ])
vis_miss(selected_data %>%
select(-PersonID, -HouseID)) +
ggtitle("Missing Data Heatmap for Selected Variables") +
labs(caption = "Figure 3: Visualization of Missing Data for Selected Variables")
# Hypertension variable
selected_data <- selected_data %>%
mutate(Hypertension = as.factor(case_when(Systolic >= 120 | Diastolic >= 80 ~ 1,
TRUE ~ 0)))
Normalization
numeric_cols <- sapply(selected_data, is.numeric)
normalized_data <- selected_data
normalized_data[numeric_cols] <- scale(selected_data[numeric_cols])
Multicollinearity
selected_data %>%
select(where(is.numeric), -Systolic, -Diastolic) %>%
ggscatmat() +
ggtitle("Scatter Matrix of Numeric Variables") +
labs(caption = "Figure 4: Scatter Matrix of Numeric Variables")

Logistic Regression
lr <- glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,
data = normalized_data,
family = binomial())
summary(lr)
##
## Call:
## glm(formula = Hypertension ~ SocialStatus + Gender + Age + BMI +
## SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy +
## Identity + PotassiumSodiumRatio, family = binomial(), data = normalized_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.95072 0.07782 12.217 < 2e-16 ***
## SocialStatus2 -0.07198 0.07867 -0.915 0.36024
## SocialStatus3 -0.16746 0.08156 -2.053 0.04006 *
## SocialStatus4 -0.16674 0.08641 -1.930 0.05364 .
## SocialStatus5 -0.26239 0.08307 -3.159 0.00159 **
## Gender2 -0.83919 0.05232 -16.039 < 2e-16 ***
## Age 0.77836 0.03010 25.855 < 2e-16 ***
## BMI 0.50127 0.02795 17.935 < 2e-16 ***
## SmokeStatus2 -0.23045 0.07219 -3.192 0.00141 **
## SmokeStatus3 0.01697 0.06605 0.257 0.79722
## AlcoholPercentage 0.13033 0.02743 4.752 2.02e-06 ***
## FiberEnergy -0.01895 0.03585 -0.529 0.59703
## CarbonhydrateEnergy -0.07369 0.03297 -2.235 0.02539 *
## Identity1 0.04106 0.07537 0.545 0.58591
## PotassiumSodiumRatio -0.01901 0.02685 -0.708 0.47903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10865.6 on 7931 degrees of freedom
## Residual deviance: 9196.2 on 7917 degrees of freedom
## AIC: 9226.2
##
## Number of Fisher Scoring iterations: 3
Random Forest
set.seed(3888)
rf <- randomForest(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data, ntree = 500)
Support Vector Machine
final_data <- cbind(as.data.frame(predict(dummyVars(~ . - Hypertension - PersonID - HouseID - Systolic - Diastolic, data = normalized_data),
newdata = normalized_data)),
Hypertension = normalized_data$Hypertension)
set.seed(3888)
svm <- svm(Hypertension ~ .,
data = final_data,
kernel = "radial", cost = 1, gamma = 0.1)
Model comparison (5-fold cross validation)
set.seed(3888)
k_folds <- 5
n_repeats <- 3
logistic_accuracy <- numeric(k_folds * n_repeats)
logistic_f1 <- numeric(k_folds * n_repeats)
logistic_sensitivity <- numeric(k_folds * n_repeats)
logistic_specificity <- numeric(k_folds * n_repeats)
logistic_precision <- numeric(k_folds * n_repeats)
rf_accuracy <- numeric(k_folds * n_repeats)
rf_f1 <- numeric(k_folds * n_repeats)
rf_sensitivity <- numeric(k_folds * n_repeats)
rf_specificity <- numeric(k_folds * n_repeats)
rf_precision <- numeric(k_folds * n_repeats)
svm_accuracy <- numeric(k_folds * n_repeats)
svm_f1 <- numeric(k_folds * n_repeats)
svm_sensitivity <- numeric(k_folds * n_repeats)
svm_specificity <- numeric(k_folds * n_repeats)
svm_precision <- numeric(k_folds * n_repeats)
ensure_factor_levels <- function(predictions, reference) {
factor(predictions, levels = levels(reference))
}
for (repeat_idx in 1:n_repeats) {
folds <- createFolds(normalized_data$Hypertension, k = k_folds, list = TRUE, returnTrain = FALSE)
for (i in 1:k_folds) {
fold_idx <- (repeat_idx - 1) * k_folds + i
test_indices <- folds[[i]]
test_data <- normalized_data[test_indices, ]
train_data <- normalized_data[-test_indices, ]
log_model <- glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,
data = train_data, family = binomial())
log_predictions <- predict(log_model, test_data, type = "response")
log_class <- ifelse(log_predictions > 0.5, 1, 0)
log_class <- ensure_factor_levels(log_class, test_data$Hypertension)
log_conf_matrix <- confusionMatrix(log_class, test_data$Hypertension, positive = "1")
logistic_accuracy[fold_idx] <- log_conf_matrix$overall['Accuracy']
logistic_f1[fold_idx] <- log_conf_matrix$byClass['F1']
logistic_sensitivity[fold_idx] <- log_conf_matrix$byClass['Sensitivity']
logistic_specificity[fold_idx] <- log_conf_matrix$byClass['Specificity']
logistic_precision[fold_idx] <- log_conf_matrix$byClass['Precision']
rf_model <- randomForest(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,
data = train_data, ntree = 500)
rf_predictions <- predict(rf_model, test_data)
rf_predictions <- ensure_factor_levels(rf_predictions, test_data$Hypertension)
rf_conf_matrix <- confusionMatrix(rf_predictions, test_data$Hypertension, positive = "1")
rf_accuracy[fold_idx] <- rf_conf_matrix$overall['Accuracy']
rf_f1[fold_idx] <- rf_conf_matrix$byClass['F1']
rf_sensitivity[fold_idx] <- rf_conf_matrix$byClass['Sensitivity']
rf_specificity[fold_idx] <- rf_conf_matrix$byClass['Specificity']
rf_precision[fold_idx] <- rf_conf_matrix$byClass['Precision']
dummy_model <- dummyVars(~ . - Hypertension - PersonID - HouseID - Systolic - Diastolic, data = train_data)
encoded_train_data <- as.data.frame(predict(dummy_model, newdata = train_data))
encoded_test_data <- as.data.frame(predict(dummy_model, newdata = test_data))
encoded_train_data$Hypertension <- train_data$Hypertension
encoded_test_data$Hypertension <- test_data$Hypertension
svm_model <- svm(Hypertension ~ ., data = encoded_train_data, kernel = "radial", cost = 1, gamma = 0.1)
svm_predictions <- predict(svm_model, encoded_test_data)
svm_predictions <- ensure_factor_levels(svm_predictions, encoded_test_data$Hypertension)
svm_conf_matrix <- confusionMatrix(svm_predictions, encoded_test_data$Hypertension, positive = "1")
svm_accuracy[fold_idx] <- svm_conf_matrix$overall['Accuracy']
svm_f1[fold_idx] <- svm_conf_matrix$byClass['F1']
svm_sensitivity[fold_idx] <- svm_conf_matrix$byClass['Sensitivity']
svm_specificity[fold_idx] <- svm_conf_matrix$byClass['Specificity']
svm_precision[fold_idx] <- svm_conf_matrix$byClass['Precision']
}
}
ave_logistic_accuracy <- mean(logistic_accuracy)
ave_logistic_f1 <- mean(logistic_f1)
ave_logistic_sensitivity <- mean(logistic_sensitivity)
ave_logistic_specificity <- mean(logistic_specificity)
ave_logistic_precision <- mean(logistic_precision)
ave_rf_accuracy <- mean(rf_accuracy)
ave_rf_f1 <- mean(rf_f1)
ave_rf_sensitivity <- mean(rf_sensitivity)
ave_rf_specificity <- mean(rf_specificity)
ave_rf_precision <- mean(rf_precision)
ave_svm_accuracy <- mean(svm_accuracy)
ave_svm_f1 <- mean(svm_f1)
ave_svm_sensitivity <- mean(svm_sensitivity)
ave_svm_specificity <- mean(svm_specificity)
ave_svm_precision <- mean(svm_precision)
results_df <- data.frame(
Model = c("Logistic Regression", "Random Forest", "SVM"),
Average_Accuracy = c(ave_logistic_accuracy, ave_rf_accuracy, ave_svm_accuracy),
Average_F1 = c(ave_logistic_f1, ave_rf_f1, ave_svm_f1),
Average_Sensitivity = c(ave_logistic_sensitivity, ave_rf_sensitivity, ave_svm_sensitivity),
Average_Specificity = c(ave_logistic_specificity, ave_rf_specificity, ave_svm_specificity),
Average_Precision = c(ave_logistic_precision, ave_rf_precision, ave_svm_precision)
)
kable(results_df, format = "markdown", caption = "Table 5:Model Performance Metrics") %>%
kable_styling("striped", full_width = F)
Table 5:Model Performance Metrics
|
Model
|
Average_Accuracy
|
Average_F1
|
Average_Sensitivity
|
Average_Specificity
|
Average_Precision
|
|
Logistic Regression
|
0.7044885
|
0.7481466
|
0.7783494
|
0.6089291
|
0.7202971
|
|
Random Forest
|
0.6956228
|
0.7396327
|
0.7665783
|
0.6038241
|
0.7147264
|
|
SVM
|
0.6991105
|
0.7455815
|
0.7818504
|
0.5920600
|
0.7127107
|
results_long <- results_df %>%
pivot_longer(cols = starts_with("Average_"),
names_to = "Metric",
values_to = "Value") %>%
mutate(Metric = sub("Average_", "", Metric))
colors <- c('#90F1EF', '#ffd6e0', '#ffef9f')
fig <- plot_ly(results_long, x = ~Metric, y = ~round(Value, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(Value, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
title = "Figure 5: Model Performance Metrics Comparison",
yaxis = list(title = 'Score', range = c(0.5, 0.8)),
barmode = 'group',
paper_bgcolor = 'rgba(245, 246, 249, 1)',
showlegend = TRUE, # Show legend for models
margin = list(l = 50, r = 50, b = 100, t = 100, pad = 4),
plot_bgcolor = 'rgba(0, 0, 0, 0)'
)
fig
Feature Selection
Group by Age
normalized_data_young <- selected_data %>%
filter(Age < 50)
normalized_data_old <- selected_data %>%
filter(Age >= 50)
numeric_cols <- sapply(normalized_data_young, is.numeric)
normalized_data_young[numeric_cols] <- scale(normalized_data_young[numeric_cols])
numeric_cols <- sapply(normalized_data_old, is.numeric)
normalized_data_old[numeric_cols] <- scale(normalized_data_old[numeric_cols])
Group by Varibles
Set1
lr1 = glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data, family = binomial())
lr2 = glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,data = normalized_data_young, family = binomial())
lr3 = glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data_old, family = binomial())
models1 <- list(lr1, lr2, lr3)
model_names1 <- c("Logistic Regression Model A(All, All)",
"Logistic Regression Model B(All, Young)",
"Logistic Regression Model C(All, Old)")
coef_plots1 <- list()
pvalue_plots1 <- list()
for (i in seq_along(models1)) {
model <- models1[[i]]
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
p <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names1[i]) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
coef_plots1[[i]] <- p
}
for (i in seq_along(models1)) {
model <- models1[[i]]
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +
geom_point() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names1[i], y = "P-Value") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
pvalue_plots1[[i]] <- p
}
combined_coef_plot1 <- wrap_plots(coef_plots1, ncol = 3) +
plot_annotation(title = "Logistic Regression Coefficient Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_pvalue_plot2 <- wrap_plots(pvalue_plots1, ncol = 3) +
plot_annotation(title = "Logistic Regression P-Value Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_plot1 <- wrap_plots(combined_coef_plot1, combined_pvalue_plot2, nrow = 2)
combined_plot1

Set2
lr4 = glm(Hypertension ~ SocialStatus + SmokeStatus + Gender + AlcoholPercentage + Gender + BMI + FiberEnergy + CarbonhydrateEnergy + Age,data = normalized_data, family = binomial())
lr5 = glm(Hypertension ~ SocialStatus + SmokeStatus + Gender + BMI + AlcoholPercentage + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_young, family = binomial())
lr6 = glm(Hypertension ~ SocialStatus + Gender + BMI + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_old, family = binomial())
models2 <- list(lr4, lr5, lr6)
model_names2 <- c("Logistic Regression Model a(Few, All)",
"Logistic Regression Model b(Few, Young)",
"Logistic Regression Model c(Few, Old)")
coef_plots2 <- list()
pvalue_plots2 <- list()
for (i in seq_along(models2)) {
model <- models2[[i]]
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
p <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names2[i]) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
coef_plots2[[i]] <- p
}
for (i in seq_along(models1)) {
model <- models1[[i]]
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +
geom_point() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names2[i], y = "P-Value") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
pvalue_plots2[[i]] <- p
}
combined_coef_plot3 <- wrap_plots(coef_plots2, ncol = 3) +
plot_annotation(title = "Logistic Regression Coefficient Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_pvalue_plot4 <- wrap_plots(pvalue_plots2, ncol = 3) +
plot_annotation(title = "Logistic Regression P-Value Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_plot2 <- wrap_plots(combined_coef_plot3, combined_pvalue_plot4, nrow = 2)
combined_plot2

Set3
lr7 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy,data = normalized_data, family = binomial())
lr8 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_young, family = binomial())
lr9 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_old, family = binomial())
models3 <- list(lr7, lr8, lr9)
model_names3 <- c("Logistic Regression Model i(Two, All)",
"Logistic Regression Model ii(Two, Young)",
"Logistic Regression Model iii(Two, Old)")
pvalue_plots3 <- list()
coef_plots3 <- list()
# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models3)) {
model <- models3[[i]]
# Create a data frame of coefficients and p-values
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
# Create the p-value plot for the current model
p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) + # Use p-values here
geom_point() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names3[i], y = "P-Value") + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the p-value plot in the list
pvalue_plots3[[i]] <- p
# Create the coefficient plot for the current model
coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names3[i]) + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the coefficient plot in the list
coef_plots3[[i]] <- coef_plot
}
combined_coef_plot5 <- wrap_plots(coef_plots3, ncol = 3) +
plot_annotation(title = "Logistic Regression Coefficient Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_pvalue_plot6 <- wrap_plots(pvalue_plots3, ncol = 3) +
plot_annotation(title = "Logistic Regression P-Value Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_plot3 <- wrap_plots(combined_coef_plot5, combined_pvalue_plot6, nrow = 2)
combined_plot3

Full Plot
wrap_plots(combined_coef_plot1,
combined_coef_plot3,
combined_coef_plot5, nrow = 3)

wrap_plots(combined_pvalue_plot2,
combined_pvalue_plot4,
combined_pvalue_plot6, nrow = 3)

Lasso
x <- model.matrix(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,
data = normalized_data)[, -1]
y <- normalized_data$Hypertension
lasso_model <- cv.glmnet(x, y, family = "binomial", alpha = 1)
best_lambda <- lasso_model$lambda.min
print(best_lambda)
## [1] 0.002258056
lasso_coefficients <- coef(lasso_model, s = best_lambda)
print(lasso_coefficients)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.869479944
## SocialStatus2 .
## SocialStatus3 -0.081628276
## SocialStatus4 -0.076872408
## SocialStatus5 -0.174128820
## Gender2 -0.808526376
## Age 0.759028895
## BMI 0.487845075
## SmokeStatus2 -0.209441021
## SmokeStatus3 .
## AlcoholPercentage 0.116352372
## FiberEnergy -0.011067435
## CarbonhydrateEnergy -0.070421423
## Identity1 0.048984935
## PotassiumSodiumRatio -0.004548014
Group By Gender
normalized_data_male <- normalized_data %>%
filter(Gender == 1)
normalized_data_female <- normalized_data %>%
filter(Gender == 2)
Set4
lr10 = glm(Hypertension ~ SocialStatus + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,data = normalized_data_male, family = binomial())
lr11 = glm(Hypertension ~ SocialStatus + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data_female, family = binomial())
models4 <- list(lr1, lr10, lr11)
model_names4 <- c("Logistic Regression Model D(All, All)",
"Logistic Regression Model E(All, Male)",
"Logistic Regression Model F(All, Female")
pvalue_plots4 <- list()
coef_plots4 <- list()
# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models4)) {
model <- models4[[i]]
# Create a data frame of coefficients and p-values
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
# Create the p-value plot for the current model
p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) + # Use p-values here
geom_point() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names4[i], y = "P-Value") + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the p-value plot in the list
pvalue_plots4[[i]] <- p
# Create the coefficient plot for the current model
coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names4[i]) + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the coefficient plot in the list
coef_plots4[[i]] <- coef_plot
}
combined_coef_plot7 <- wrap_plots(coef_plots4, ncol = 3) +
plot_annotation(title = "Logistic Regression Coefficient Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_pvalue_plot8 <- wrap_plots(pvalue_plots4, ncol = 3) +
plot_annotation(title = "Logistic Regression P-Value Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_plot4 <- wrap_plots(combined_coef_plot7, combined_pvalue_plot8, nrow = 2)
combined_plot4

Set5
lr12 = glm(Hypertension ~ SocialStatus + SmokeStatus + BMI + AlcoholPercentage + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_male, family = binomial())
lr13 = glm(Hypertension ~ SocialStatus + BMI + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_female, family = binomial())
models5 <- list(lr4, lr12, lr13)
model_names5 <- c("Logistic Regression Model d(Few, All)",
"Logistic Regression Model e(Few, Male)",
"Logistic Regression Model f(Few, Female)")
pvalue_plots5 <- list()
coef_plots5 <- list()
# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models5)) {
model <- models5[[i]]
# Create a data frame of coefficients and p-values
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
# Create the p-value plot for the current model
p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) + # Use p-values here
geom_point() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names5[i], y = "P-Value") + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the p-value plot in the list
pvalue_plots5[[i]] <- p
# Create the coefficient plot for the current model
coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names5[i]) + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the coefficient plot in the list
coef_plots5[[i]] <- coef_plot
}
combined_coef_plot9 <- wrap_plots(coef_plots5, ncol = 3) +
plot_annotation(title = "Logistic Regression Coefficient Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_pvalue_plot10 <- wrap_plots(pvalue_plots5, ncol = 3) +
plot_annotation(title = "Logistic Regression P-Value Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_plot5 <- wrap_plots(combined_coef_plot9, combined_pvalue_plot10, nrow = 2)
combined_plot5
### Set6
lr14 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_male, family = binomial())
lr15 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_female, family = binomial())
models6 <- list(lr7, lr14, lr15)
model_names6 <- c("Logistic Regression Model iv(Two, All)",
"Logistic Regression Model v(Two, Male)",
"Logistic Regression Model vi(Two, Female)")
pvalue_plots6 <- list()
coef_plots6 <- list()
# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models6)) {
model <- models6[[i]]
# Create a data frame of coefficients and p-values
coef_df <- as.data.frame(coef(summary(model)))
names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
coef_df$Variable <- rownames(coef_df)
# Create the p-value plot for the current model
p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) + # Use p-values here
geom_point() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names6[i], y = "P-Value") + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the p-value plot in the list
pvalue_plots6[[i]] <- p
# Create the coefficient plot for the current model
coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = model_names6[i]) + # Title for each model
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Store the coefficient plot in the list
coef_plots6[[i]] <- coef_plot
}
combined_coef_plot11 <- wrap_plots(coef_plots6, ncol = 3) +
plot_annotation(title = "Logistic Regression Coefficient Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_pvalue_plot12 <- wrap_plots(pvalue_plots6, ncol = 3) +
plot_annotation(title = "Logistic Regression P-Value Plots",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
combined_plot6 <- wrap_plots(combined_coef_plot11, combined_pvalue_plot12, nrow = 2)
combined_plot6

Full Plot
wrap_plots(combined_coef_plot7,
combined_coef_plot9,
combined_coef_plot11, nrow = 3)

wrap_plots(combined_pvalue_plot8,
combined_pvalue_plot10,
combined_pvalue_plot12, nrow = 3)
